The code used for processing the files and working on them is available in the eta_modules folder to separate most of the implementation details from the presentation of results. I try to show additional relevant functionality beyond pure results to give a better indication of the process of arriving at these results.
import os
from glob import glob
import numpy as np
import pandas as pd
import nltk
from eta_modules.preprocessing import Document, Corpus
from eta_modules.analysis import HierarchicalClusterAnalysis, PCA, TopicModel, WordEmbedding, SentimentAnalysis
C:\Users\David\miniconda3\envs\ds\lib\site-packages\gensim\similarities\__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning. warnings.warn(msg)
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('stopwords')
We start by loading in the XML files for each work and parsing them to a reasonable degree with BeautifulSoup and NLTK.
Since these works are all either plays or poems/epics, the concept of a "chapter" or "paragraph" doesn't translate perfectly compared to, e.g., a novel. However, the Perseus Digital Library (where these files are sourced from) has added at least top-level divisions to break up texts. In some cases, these divisions truly exist in the text (for example, The Iliad is broken into 24 books); in other cases, like plays, these divisions don't seem to be directly present in the text, but are akin to something like a "scene". I've considered all of these largest divisions as "chapters".
To get at something like a "paragraph", I used a different approach based on whether the work was a play or not:
I've included code/functions that are able to either parse the XML files from their initial state or load in pre-computed Corpus tables to speed up notebook computations.
def parse_corpus(docpaths, tfidf_bag=None, tfidf_methods=None):
    """Parse raw XML works into Documents and build an annotated Corpus.

    Every path is wrapped in a ``Document``, split into paragraphs and
    tokenized. The documents are then combined into a ``Corpus`` whose
    vocabulary is extracted/annotated and whose TF-IDF tables are computed.

    Args:
        docpaths: Iterable of paths to the XML files to parse.
        tfidf_bag: OHCO levels that define one "document" for TF-IDF.
            Defaults to ``['work_id', 'chapter_id']``.
        tfidf_methods: TF-IDF methods to compute.
            Defaults to ``['n', 'max', 'bool']``.

    Returns:
        tuple: ``(doc_list, corpus)`` -- the parsed documents and the corpus
        built from them.
    """
    # Build the defaults per call. A list literal in the signature is a single
    # shared object, so any mutation downstream would leak into later calls.
    # Lists (not tuples) are kept because they are handed on unchanged.
    if tfidf_bag is None:
        tfidf_bag = ['work_id', 'chapter_id']
    if tfidf_methods is None:
        tfidf_methods = ['n', 'max', 'bool']

    doc_list = []
    for path in docpaths:
        doc = Document(path)
        doc.parse_text_to_paras()
        doc.tokenize(remove_pos_tuple=True, remove_ws=True)
        doc_list.append(doc)

    corpus = Corpus(doc_list)
    corpus.extract_annotate_vocab()
    corpus.compute_tfidf(OHCO_level=tfidf_bag, methods=tfidf_methods)
    return doc_list, corpus
# Annotating the vocabulary uses NLTK and is slow since it's not a vectorized operation, so it's much faster to reload the tables if they already exist
# Still recompute TFIDF since bags/methods can differ between calculations
def load_corpus(docpaths=None, table_dir=None, tfidf_bag=None, tfidf_methods=None):
    """Return ``(doc_list, corpus)`` from saved tables or from raw XML files.

    Pre-computed CSV tables in ``table_dir`` are preferred when present;
    otherwise the works listed in ``docpaths`` are parsed from scratch.
    TF-IDF is computed in both cases with the requested bag and methods.

    Args:
        docpaths: Paths of the XML files to parse when no saved tables are
            used. May be ``None`` if ``table_dir`` holds the tables.
        table_dir: Directory that may contain previously saved ``*.csv``
            corpus tables. May be ``None`` to force parsing.
        tfidf_bag: OHCO levels that define one "document" for TF-IDF.
            Defaults to ``['work_id', 'chapter_id']``.
        tfidf_methods: TF-IDF methods to compute.
            Defaults to ``['n', 'max', 'bool']``.

    Returns:
        tuple: ``(doc_list, corpus)``. ``doc_list`` is an empty list when the
        corpus was loaded from saved tables.

    Raises:
        ValueError: If no CSV tables were found under ``table_dir`` and
            ``docpaths`` was not supplied, so there is nothing to load.
    """
    # Fresh default lists per call instead of shared mutable defaults.
    if tfidf_bag is None:
        tfidf_bag = ['work_id', 'chapter_id']
    if tfidf_methods is None:
        tfidf_methods = ['n', 'max', 'bool']

    # Saved tables win, but only if the directory actually contains some.
    if table_dir is not None and glob(os.path.join(table_dir, '*.csv')):
        corpus = Corpus()
        corpus.load_tables(table_dir)
        # TF-IDF is recomputed because bags/methods can differ between runs.
        corpus.compute_tfidf(OHCO_level=tfidf_bag, methods=tfidf_methods)
        return [], corpus

    # No usable tables: fall back to parsing the raw files when given.
    if docpaths is not None:
        return parse_corpus(docpaths=docpaths, tfidf_bag=tfidf_bag, tfidf_methods=tfidf_methods)

    # Previously this path fell through to `return output` with `output`
    # never assigned, which surfaced as an opaque UnboundLocalError.
    raise ValueError(
        'load_corpus needs either a table_dir containing saved *.csv tables '
        'or docpaths to parse; neither was available.'
    )
# All paths are resolved relative to the parent of the current working directory
root_dir = os.path.abspath('..')
data_dir = os.path.join(root_dir, 'data')

# Make output directory if it does not exist. makedirs(exist_ok=True) replaces
# the try/mkdir/except-pass pattern and also creates a missing parent directory
# instead of failing with FileNotFoundError.
output_dir = os.path.join(data_dir, 'outputs')
os.makedirs(output_dir, exist_ok=True)

# Get all XML file paths. glob returns matches in arbitrary order, so sort them
# to keep the order in which works are parsed stable across machines.
docpaths = sorted(glob(os.path.join(data_dir, 'raw', '**', '*.xml'), recursive=True))
table_dir = os.path.join(output_dir, 'corpus')

# OHCO levels from coarsest to finest; each bag is a prefix of this hierarchy
OHCO = ['work_id', 'chapter_id', 'para_id', 'sent_id']
book_bag = OHCO[:1]
chapter_bag = OHCO[:2]
paragraph_bag = OHCO[:3]

# Load in pre-existing tables
# doc_list, corp = load_corpus(table_dir=table_dir, tfidf_bag=chapter_bag) # doc_list is empty here, but it's not important for any further analyses; it's mostly useful for printing out sections of text (if you so desired)

# Parse from scratch
doc_list, corp = load_corpus(docpaths=docpaths, tfidf_bag=chapter_bag)
# Library table: one row per work (indexed by work_id) with its author, title, and source path
corp.lib
| author | title | path | |
|---|---|---|---|
| work_id | |||
| 0 | Aeschylus | Agamemnon | C:\Users\David\Documents\GitHub\latin-greek-te... |
| 1 | Aeschylus | Eumenides | C:\Users\David\Documents\GitHub\latin-greek-te... |
| 2 | Aeschylus | Libation Bearers | C:\Users\David\Documents\GitHub\latin-greek-te... |
| 3 | Aeschylus | Prometheus Bound | C:\Users\David\Documents\GitHub\latin-greek-te... |
| 4 | Euripides | Bacchae | C:\Users\David\Documents\GitHub\latin-greek-te... |
| 5 | Euripides | Iphigenia in Aulis | C:\Users\David\Documents\GitHub\latin-greek-te... |
| 6 | Euripides | The Trojan Women | C:\Users\David\Documents\GitHub\latin-greek-te... |
| 7 | Hesiod | Theogony | C:\Users\David\Documents\GitHub\latin-greek-te... |
| 8 | Hesiod | Works and Days | C:\Users\David\Documents\GitHub\latin-greek-te... |
| 9 | Homer | Iliad | C:\Users\David\Documents\GitHub\latin-greek-te... |
| 10 | Homer | Odyssey | C:\Users\David\Documents\GitHub\latin-greek-te... |
| 11 | Homeric_Hymns | Homeric Hymns | C:\Users\David\Documents\GitHub\latin-greek-te... |
| 12 | Ovid | Metamorphoses | C:\Users\David\Documents\GitHub\latin-greek-te... |
| 13 | Sophocles | Ajax | C:\Users\David\Documents\GitHub\latin-greek-te... |
| 14 | Sophocles | Antigone | C:\Users\David\Documents\GitHub\latin-greek-te... |
| 15 | Sophocles | Electra | C:\Users\David\Documents\GitHub\latin-greek-te... |
| 16 | Sophocles | Oedipus at Colonus | C:\Users\David\Documents\GitHub\latin-greek-te... |
| 17 | Sophocles | Oedipus Tyrannus | C:\Users\David\Documents\GitHub\latin-greek-te... |
| 18 | Vergil | Aeneid | C:\Users\David\Documents\GitHub\latin-greek-te... |
# Token table: one row per token, indexed by the OHCO levels plus token_id, with its POS tag, raw token string, and normalized term string
corp.token
| pos | token_str | term_str | |||||
|---|---|---|---|---|---|---|---|
| work_id | chapter_id | para_id | sent_id | token_id | |||
| 0 | 0 | 0 | 0 | 0 | NNP | Watchman: | watchman |
| 1 | NNP | Release | release | ||||
| 2 | IN | from | from | ||||
| 3 | DT | this | this | ||||
| 4 | JJ | weary | weary | ||||
| ... | ... | ... | ... | ... | ... | ... | ... |
| 18 | 11 | 29 | 13 | 14 | NN | wrath | wrath |
| 15 | TO | to | to | ||||
| 16 | VB | darkness | darkness | ||||
| 17 | JJ | fled | fled | ||||
| 18 | NN | away. | away |
740470 rows × 3 columns
- n: raw sum of term occurrences within a document (a document can be a book, chapter, paragraph, etc., depending on the chosen bag)
- max: maximum term frequency normalization: $tf_{t,d} = 0.4 + 0.6 \frac{tf_{t,d}}{tf_{max}(d)}$; divide the raw count of a term in a document by the raw count of the most frequent term in that document. This can help prevent bias towards longer documents
- bool: binary frequencies; a term either does or does not occur in a document

A chapter-level bag tends to give more significant nouns, while a paragraph-level bag or smaller will likely result in more terms like pronouns being significant.
# Ten vocabulary terms with the largest summed boolean-method TF-IDF, highest first
corp.vocab.sort_values('tfidf_bool_sum', ascending=False).head(10)
| n | stop | p_stem | pos_max | df | idf | tfidf_n_sum | tfidf_max_sum | tfidf_bool_sum | |
|---|---|---|---|---|---|---|---|---|---|
| term_str | |||||||||
| sing | 87 | 0 | sing | VBG | 58 | 2.208108 | 1.816905 | 4.661218 | 20.941718 |
| hail | 96 | 0 | hail | NNP | 66 | 2.021695 | 1.835607 | 4.764603 | 18.136252 |
| song | 172 | 0 | song | NN | 81 | 1.726239 | 2.808162 | 5.019914 | 15.020710 |
| begin | 51 | 0 | begin | VB | 38 | 2.818162 | 1.359342 | 3.795364 | 13.793796 |
| chorus | 841 | 0 | choru | NNP | 138 | 0.957565 | 7.616525 | 5.562277 | 12.176520 |
| zeus | 1270 | 0 | zeu | NNP | 170 | 0.656698 | 7.887916 | 4.092533 | 9.574987 |
| muse | 22 | 0 | muse | NNP | 18 | 3.896164 | 0.810686 | 2.520064 | 9.125414 |
| lovely | 152 | 0 | love | JJ | 53 | 2.338169 | 3.361338 | 4.336112 | 8.647434 |
| bare | 173 | 0 | bare | NN | 63 | 2.088809 | 3.417729 | 4.530909 | 8.549576 |
| lord | 380 | 0 | lord | NN | 149 | 0.846921 | 3.043821 | 4.374682 | 8.539977 |
Basic process:
- Reduce the vocabulary to a max_features-long subset using DF-IDF (document frequency * inverse-document frequency -- works as a measure of global term significance)
- TF-IDF table for those terms (hca.tfidf)
- Distance metrics (cosine, and jaccard)

hca = HierarchicalClusterAnalysis(max_features=6000, tfidf_method='max', OHCO_level=chapter_bag)
# Fit on the corpus, requesting both the cosine and Jaccard metrics
hca.fit(corp, metrics=['cosine', 'jaccard'])
# Check the dimensions of the resulting TF-IDF table
hca.tfidf.shape
(19, 6000)
# Preview the first rows of the TF-IDF table (rows: work_id, columns: terms)
hca.tfidf.head()
| term_str | escape | course | deed | wise | days | truth | nothing | tears | ships | hope | ... | cleared | existence | wreak | smithy | shadowed | chords | corded | dreary | flashes | muscles |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| work_id | |||||||||||||||||||||
| 0 | 0.157268 | 0.198696 | 0.328274 | 0.117703 | 0.206300 | 0.239128 | 0.204890 | 0.359166 | 0.158842 | 0.248154 | ... | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.0 |
| 1 | 0.146918 | 0.100943 | 0.199575 | 0.049453 | 0.048562 | 0.051170 | 0.251594 | 0.049749 | 0.000000 | 0.000000 | ... | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.0 |
| 2 | 0.219391 | 0.274928 | 0.337530 | 0.000000 | 0.106757 | 0.273227 | 0.108723 | 0.165983 | 0.000000 | 0.219631 | ... | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.0 |
| 3 | 0.232349 | 0.220720 | 0.107114 | 0.108874 | 0.052857 | 0.231516 | 0.278479 | 0.149096 | 0.053396 | 0.053862 | ... | 0.207856 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.203409 | 0.203525 | 0.0 |
| 4 | 0.091184 | 0.136879 | 0.091675 | 0.233985 | 0.000000 | 0.045688 | 0.090815 | 0.093825 | 0.000000 | 0.091845 | ... | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.0 |
5 rows × 6000 columns
# TF-IDF values used for Jaccard: casting to bool and back to int turns every
# non-zero weight into 1 and leaves zeros as 0
hca.tfidf.astype('bool').astype('int').head()
| term_str | escape | course | deed | wise | days | truth | nothing | tears | ships | hope | ... | cleared | existence | wreak | smithy | shadowed | chords | corded | dreary | flashes | muscles |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| work_id | |||||||||||||||||||||
| 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | ... | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 |
| 4 | 1 | 1 | 1 | 1 | 0 | 1 | 1 | 1 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 6000 columns
Overall, both metrics lead to fairly good segmentation, both by author and between plays (Aeschylus, Sophocles, and Euripides) vs. non-plays. Additionally, with both metrics, both of the Roman works (by Vergil and Ovid) have a high degree of similarity to each other, and they are also grouped together with both of Homer's works, which likely indicates some artistic influence. The Aeneid certainly takes inspiration from the Iliad and the Odyssey.
# Draw the dendrogram with complete linkage.
# NOTE(review): color_thresh is presumably the distance below which branches share a color -- confirm in eta_modules.analysis
hca.plot_dendrogram(linkage='complete', color_thresh=0.6, figsize=(8, 10))
Basic process:
- (max_features x max_features matrix)

pca = PCA(max_features=6000, tfidf_method='max', OHCO_level=chapter_bag)
# Fit on the corpus, keeping 10 components
pca.fit(corp, n_components=10)
# Show the DCM table: one row per (author, work, chapter), one column per component PC0-PC9
pca.dcm
| pc_id | PC0 | PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | PC8 | PC9 | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| author | work_id | chapter_id | ||||||||||
| Aeschylus | 0 | 0 | 0.057497 | -0.013661 | 0.010521 | 0.053873 | -0.075172 | 0.077731 | 0.047608 | 0.047069 | -0.011809 | -0.000315 |
| 1 | 0.053718 | -0.018887 | 0.016128 | 0.036141 | -0.053561 | 0.086005 | 0.027213 | 0.035085 | -0.026227 | 0.051715 | ||
| 2 | 0.043588 | -0.034729 | -0.002787 | 0.036574 | -0.070483 | 0.108517 | 0.053821 | 0.057228 | -0.057343 | 0.039432 | ||
| 3 | 0.071665 | -0.019244 | -0.021010 | 0.024067 | -0.063857 | 0.033064 | 0.065676 | 0.084621 | -0.026635 | 0.041619 | ||
| 4 | 0.023816 | -0.068131 | -0.086996 | 0.002716 | -0.069921 | 0.112674 | -0.006328 | 0.022679 | -0.062630 | 0.018398 | ||
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| Vergil | 18 | 7 | -0.251425 | -0.298854 | 0.219959 | 0.037499 | -0.105356 | -0.022902 | 0.163190 | -0.116983 | -0.095848 | -0.087087 |
| 8 | -0.277723 | -0.281795 | 0.191773 | 0.093623 | -0.115714 | -0.001080 | 0.136849 | -0.144259 | -0.072943 | -0.122531 | ||
| 9 | -0.278448 | -0.303302 | 0.193671 | 0.100347 | -0.149359 | -0.001530 | 0.161956 | -0.153768 | -0.074974 | -0.104072 | ||
| 10 | -0.267634 | -0.314467 | 0.163282 | 0.092486 | -0.119070 | -0.000040 | 0.167748 | -0.133087 | -0.094929 | -0.122952 | ||
| 11 | -0.289986 | -0.299875 | 0.170745 | 0.101264 | -0.126053 | -0.040280 | 0.146078 | -0.138442 | -0.100881 | -0.096678 |
268 rows × 10 columns
# Show the loadings table: one row per term, one column per component
pca.loadings
| pc_id | PC0 | PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | PC8 | PC9 |
|---|---|---|---|---|---|---|---|---|---|---|
| term_str | ||||||||||
| escape | -0.010120 | -0.002449 | -0.027795 | 0.005076 | 0.004283 | -0.004381 | -0.005445 | 0.008432 | -0.009285 | 0.008761 |
| course | -0.008573 | -0.011078 | -0.017529 | -0.002960 | -0.007751 | 0.008520 | 0.009102 | 0.006608 | 0.001857 | 0.014243 |
| deed | 0.000557 | -0.011832 | -0.031021 | 0.001822 | -0.006226 | -0.015055 | -0.004993 | 0.002886 | -0.032243 | 0.008116 |
| wise | -0.015591 | 0.017736 | -0.003561 | -0.012605 | 0.016556 | 0.004607 | 0.001657 | -0.012111 | -0.010527 | -0.004924 |
| days | -0.014480 | -0.005263 | -0.020372 | -0.011556 | -0.013908 | 0.000277 | 0.000018 | -0.002721 | 0.002265 | 0.006737 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| chords | -0.005593 | -0.001430 | -0.005340 | -0.016261 | -0.006412 | -0.002907 | -0.002382 | 0.008008 | -0.005154 | -0.004271 |
| corded | -0.011193 | 0.010994 | -0.006903 | -0.018157 | -0.005729 | 0.000658 | 0.009186 | 0.001171 | -0.000573 | -0.000514 |
| dreary | -0.002619 | -0.011666 | 0.003299 | 0.001401 | -0.000670 | 0.004587 | -0.001726 | 0.002471 | 0.009122 | 0.012241 |
| flashes | 0.002311 | -0.005922 | -0.004316 | 0.004263 | -0.007960 | -0.016765 | 0.002202 | -0.025626 | 0.003306 | 0.002177 |
| muscles | -0.006407 | -0.010455 | 0.004431 | 0.000872 | 0.006983 | -0.005453 | -0.014534 | 0.011424 | 0.007463 | 0.009662 |
6000 rows × 10 columns
# Plot two pairs of components against each other: PC0 with PC1, then PC1 with PC2
pca.plot_2d(comp_id_1=0, comp_id_2=1)
pca.plot_2d(comp_id_1=1, comp_id_2=2)
Topic models can provide an interpretable, high-level model for the patterns and themes of documents. The main outputs of a Latent Dirichlet Allocation (LDA) model are a $\theta$ and $\phi$ table.
Some additional details are available from the scikit-learn website or Wikipedia
# Configure the topic model: 40 topics, max_features=6000, 25 iterations,
# proper nouns removed, and a fixed random_state
lda = TopicModel(remove_proper_nouns=True, max_features=6000, max_iter=25, n_topics=40, random_state=0)
lda.fit(corp)
# Show the theta table: one row per (work, chapter), one column per topic
lda.theta
| topic_id | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| work_id | chapter_id | |||||||||||||||||||||
| 0 | 0 | 0.000135 | 0.000135 | 0.000135 | 0.000135 | 0.000135 | 0.000135 | 0.000135 | 0.000135 | 0.000135 | 0.000135 | ... | 0.000135 | 0.000135 | 0.000135 | 0.000135 | 0.000135 | 0.000135 | 0.000135 | 0.000135 | 0.000135 | 0.000135 |
| 1 | 0.000106 | 0.000106 | 0.000106 | 0.000106 | 0.000106 | 0.000106 | 0.040322 | 0.000106 | 0.416396 | 0.000106 | ... | 0.196527 | 0.000106 | 0.000106 | 0.000106 | 0.000106 | 0.000106 | 0.000106 | 0.000106 | 0.000106 | 0.000106 | |
| 2 | 0.000133 | 0.000133 | 0.000133 | 0.072866 | 0.000133 | 0.000133 | 0.000133 | 0.000133 | 0.457596 | 0.000133 | ... | 0.000133 | 0.000133 | 0.000133 | 0.000133 | 0.000133 | 0.000133 | 0.000133 | 0.000133 | 0.000133 | 0.000133 | |
| 3 | 0.000133 | 0.000133 | 0.000133 | 0.000133 | 0.000133 | 0.000133 | 0.000133 | 0.000133 | 0.000133 | 0.000133 | ... | 0.000133 | 0.000133 | 0.000133 | 0.000133 | 0.000133 | 0.000133 | 0.000133 | 0.000133 | 0.000133 | 0.000133 | |
| 4 | 0.000068 | 0.000068 | 0.000068 | 0.048339 | 0.000068 | 0.000068 | 0.000068 | 0.000068 | 0.499722 | 0.000068 | ... | 0.000068 | 0.000068 | 0.148245 | 0.000068 | 0.000068 | 0.000068 | 0.000068 | 0.000068 | 0.000068 | 0.000068 | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18 | 7 | 0.000013 | 0.000013 | 0.000013 | 0.418549 | 0.000013 | 0.000013 | 0.000013 | 0.000013 | 0.039973 | 0.000013 | ... | 0.000013 | 0.000013 | 0.026283 | 0.000013 | 0.000013 | 0.000013 | 0.000013 | 0.000013 | 0.000013 | 0.038287 |
| 8 | 0.000012 | 0.000012 | 0.000012 | 0.662574 | 0.000012 | 0.000012 | 0.000012 | 0.000012 | 0.059953 | 0.000012 | ... | 0.000012 | 0.005566 | 0.011198 | 0.000012 | 0.000012 | 0.000012 | 0.000012 | 0.000012 | 0.000012 | 0.077584 | |
| 9 | 0.000011 | 0.000011 | 0.000011 | 0.491197 | 0.000011 | 0.000011 | 0.000011 | 0.000011 | 0.046431 | 0.000011 | ... | 0.000011 | 0.197777 | 0.000011 | 0.000011 | 0.000011 | 0.000011 | 0.000011 | 0.000011 | 0.000011 | 0.100956 | |
| 10 | 0.000010 | 0.000010 | 0.000010 | 0.692856 | 0.000010 | 0.000010 | 0.000010 | 0.000010 | 0.105097 | 0.000010 | ... | 0.000010 | 0.000010 | 0.000010 | 0.000010 | 0.000010 | 0.000010 | 0.000010 | 0.000010 | 0.000010 | 0.040660 | |
| 11 | 0.000010 | 0.000010 | 0.000010 | 0.614628 | 0.000010 | 0.000010 | 0.000010 | 0.000010 | 0.052493 | 0.000010 | ... | 0.000010 | 0.000010 | 0.000010 | 0.000010 | 0.000010 | 0.000010 | 0.000010 | 0.000010 | 0.000010 | 0.088510 |
268 rows × 40 columns
# Dimensions of the phi table (topics x terms)
lda.phi.shape
(40, 6000)
# Preview the first rows of the phi table (rows: topic_id, columns: terms)
lda.phi.head()
| term_str | abide | abides | abode | abodes | abomination | abroad | absence | absent | abundance | abuse | ... | young | younger | youth | youth death | youths | youths bowls | zeal | zenith | zephyrs | zeus |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| topic_id | |||||||||||||||||||||
| 0 | 0.0250 | 0.025000 | 0.025000 | 0.025 | 0.025 | 0.025 | 0.025 | 0.025 | 0.025 | 0.025 | ... | 0.025 | 0.025 | 0.025000 | 0.025 | 0.025000 | 0.025 | 0.025 | 0.025 | 0.025 | 1.025 |
| 1 | 0.0250 | 0.025000 | 0.025000 | 0.025 | 0.025 | 0.025 | 0.025 | 0.025 | 0.025 | 0.025 | ... | 0.025 | 0.025 | 0.025000 | 0.025 | 0.025000 | 0.025 | 0.025 | 0.025 | 0.025 | 0.025 |
| 2 | 0.0250 | 0.025000 | 0.025000 | 0.025 | 0.025 | 0.025 | 0.025 | 0.025 | 0.025 | 0.025 | ... | 0.025 | 0.025 | 1.025000 | 0.025 | 0.025000 | 0.025 | 0.025 | 0.025 | 0.025 | 0.025 |
| 3 | 2.7821 | 4.978934 | 1.366457 | 0.025 | 0.025 | 0.025 | 0.025 | 0.025 | 0.025 | 0.025 | ... | 0.025 | 0.025 | 19.886588 | 0.025 | 5.507344 | 0.025 | 0.025 | 0.025 | 0.025 | 0.025 |
| 4 | 0.0250 | 0.025000 | 0.025000 | 0.025 | 0.025 | 0.025 | 0.025 | 0.025 | 0.025 | 0.025 | ... | 0.025 | 0.025 | 0.025000 | 0.025 | 0.025000 | 0.025 | 0.025 | 0.025 | 0.025 | 0.025 |
5 rows × 6000 columns
The below dataframe presents how often particular words appear in the top 10 words associated with each topic. There are 400 words total (10 words representing each topic x 40 topics), so the p column is n/400.
# First 15 rows of the top-word table (count n and proportion p per word)
lda.get_top_words().head(15)
| n | p | |
|---|---|---|
| man | 19 | 0.0475 |
| gods | 18 | 0.0450 |
| men | 15 | 0.0375 |
| land | 14 | 0.0350 |
| house | 14 | 0.0350 |
| son | 13 | 0.0325 |
| heart | 12 | 0.0300 |
| father | 9 | 0.0225 |
| life | 9 | 0.0225 |
| hand | 8 | 0.0200 |
| city | 8 | 0.0200 |
| way | 6 | 0.0150 |
| sea | 6 | 0.0150 |
| mother | 6 | 0.0150 |
| death | 6 | 0.0150 |
Topic weights are calculated as the sum of each topic's column in the $\theta$ table.
# Plot the weight of each topic
lda.plot_topic_weights()
In this table, you can see which topics are most highly associated with each author. Darker cells indicate topics that are more highly associated with a given author. For example, Vergil (who has a single work in this corpus, the Aeneid) is highly associated with the topics that have top terms of "love, son, arms, blood, death, life, words, waves, sea, eyes" and "war, hand, sword, arms, foe, gods, way, thee, shield, spear". Given their equal weights, they could represent the two main sections of the Aeneid: the journey by sea to Italy after the end of the Trojan War and the battles that took place in Italy after their arrival.
# Author-by-topic table rendered with pandas' background gradient (darker cell = larger value)
lda.author_topic.style.background_gradient()
| author | Aeschylus | Euripides | Hesiod | Homer | Homeric_Hymns | Ovid | Sophocles | Vergil | label |
|---|---|---|---|---|---|---|---|---|---|
| topic_id | |||||||||
| 0 | 0.024961 | 0.060186 | 0.000013 | 0.000019 | 0.003460 | 0.000014 | 0.013372 | 0.000012 | 0 son, ships, earth, mountain, land, horses, course, offspring, justice, gods |
| 1 | 0.000326 | 0.000192 | 0.000013 | 0.000019 | 0.059661 | 0.000014 | 0.013328 | 0.000012 | 1 men, goddess, life, power, chamber, crafts, success, prosperity, daughter goddess, city |
| 2 | 0.080109 | 0.000192 | 0.000013 | 0.000019 | 0.001119 | 0.000014 | 0.013403 | 0.000012 | 2 house, man, stranger, heart, news, message, grief, gods, friends, way |
| 3 | 0.003261 | 0.011123 | 0.000013 | 0.000809 | 0.001119 | 0.020192 | 0.007000 | 0.468083 | 3 war, hand, arms, gods, sword, way, foe, thee, power, death |
| 4 | 0.000326 | 0.030107 | 0.000013 | 0.000019 | 0.030445 | 0.000014 | 0.000318 | 0.000012 | 4 mother, woe, child, day, trouble, father, bitter, messenger, man, bringing |
| 5 | 0.001741 | 0.006428 | 0.000013 | 0.110083 | 0.012085 | 0.003280 | 0.013400 | 0.006165 | 5 ship, comrades, men, sea, thou, man, land, heart, ships, way |
| 6 | 0.001147 | 0.001078 | 0.000013 | 0.021948 | 0.022481 | 0.001098 | 0.000318 | 0.000864 | 6 ships, son, sons, host, men, counsel, king, war, hath, people |
| 7 | 0.000326 | 0.000192 | 0.000013 | 0.000019 | 0.001119 | 0.000014 | 0.000318 | 0.000012 | 7 abide, rebuke, refrain, reef, reeds, reed, red, recompense, reckless, recess |
| 8 | 0.317842 | 0.199719 | 0.068053 | 0.003439 | 0.002440 | 0.099289 | 0.439889 | 0.073289 | 8 man, father, gods, land, city, mother, house, death, hands, life |
| 9 | 0.000326 | 0.059973 | 0.000013 | 0.000019 | 0.006061 | 0.000332 | 0.013288 | 0.000012 | 9 gods, son, day, channels, horses, land, sea, tresses, blooms, shields |
| 10 | 0.040567 | 0.000192 | 0.000013 | 0.000019 | 0.001119 | 0.000014 | 0.000318 | 0.000012 | 10 hand, death, chorus, father, man, men, justice, speech, home, enemy |
| 11 | 0.000326 | 0.021908 | 0.000013 | 0.020653 | 0.016653 | 0.000014 | 0.007609 | 0.000012 | 11 wall, ships, men, son, war, twain, battle, horses, man, ground |
| 12 | 0.013316 | 0.002676 | 0.050224 | 0.000813 | 0.026601 | 0.502979 | 0.019503 | 0.024505 | 12 love, son, death, time, life, blood, waves, hand, words, father |
| 13 | 0.029430 | 0.041423 | 0.000013 | 0.000019 | 0.028801 | 0.000014 | 0.063624 | 0.000012 | 13 father, child, gods, man, daughter, death, help, life, hand, words |
| 14 | 0.000326 | 0.032862 | 0.000013 | 0.000019 | 0.062611 | 0.000014 | 0.066458 | 0.000012 | 14 man, men, city, gods, land, mother, place, way, head, seat |
| 15 | 0.061941 | 0.043992 | 0.000013 | 0.000019 | 0.001119 | 0.000014 | 0.013492 | 0.000012 | 15 woe, house, life, altar, child, goddess, ah, heart, head, army |
| 16 | 0.001189 | 0.032772 | 0.000013 | 0.000315 | 0.008495 | 0.000014 | 0.028251 | 0.000012 | 16 land, city, children, god, things, man, men, people, whichsoever, gods |
| 17 | 0.041456 | 0.025406 | 0.000013 | 0.000019 | 0.021253 | 0.333044 | 0.018248 | 0.304552 | 17 sea, love, land, arms, eyes, waves, words, way, voice, heart |
| 18 | 0.000326 | 0.000192 | 0.000013 | 0.000019 | 0.001119 | 0.000014 | 0.000318 | 0.000012 | 18 abide, rebuke, refrain, reef, reeds, reed, red, recompense, reckless, recess |
| 19 | 0.000481 | 0.000192 | 0.000013 | 0.028435 | 0.009551 | 0.000223 | 0.000822 | 0.000012 | 19 bow, wooers, house, hall, heart, man, son, hands, men, arrow |
| 20 | 0.020494 | 0.026554 | 0.000013 | 0.000019 | 0.001119 | 0.000014 | 0.000318 | 0.000012 | 20 house, fortune, wealth, prostrate ground, rise, misery, triumph, prostrate, escape, shout |
| 21 | 0.000756 | 0.000192 | 0.000013 | 0.000019 | 0.002414 | 0.000014 | 0.028067 | 0.000012 | 21 ah, ah ah, hope, race, chorus, house, power, life mortals, mortals, ruin |
| 22 | 0.011777 | 0.006012 | 0.157401 | 0.353482 | 0.015808 | 0.017677 | 0.013288 | 0.051761 | 22 thou, man, son, men, thee, heart, house, gods, land, wooers |
| 23 | 0.007558 | 0.047441 | 0.000013 | 0.000019 | 0.001119 | 0.000014 | 0.000753 | 0.000012 | 23 man, daughter, child, army, husband, letter, marriage, matter, goddess, hand |
| 24 | 0.022108 | 0.033591 | 0.000013 | 0.000019 | 0.001119 | 0.000377 | 0.049884 | 0.000012 | 24 child, death, corpse, mother, life, house, son, fate, man, wife |
| 25 | 0.000326 | 0.000192 | 0.000013 | 0.000019 | 0.001119 | 0.000014 | 0.026257 | 0.000012 | 25 life, father, house, voice, boy, word, tomb, son, grief, land |
| 26 | 0.138606 | 0.032334 | 0.000013 | 0.000019 | 0.001119 | 0.000014 | 0.007002 | 0.000012 | 26 gods, man, house, heart, men, home, wealth, time, hand, mortals |
| 27 | 0.003412 | 0.030130 | 0.000013 | 0.000019 | 0.001119 | 0.000014 | 0.030235 | 0.000012 | 27 lock, gods, land, place, city, son, ship, awaits, hand, turn |
| 28 | 0.044257 | 0.000192 | 0.000013 | 0.000019 | 0.029258 | 0.000014 | 0.015039 | 0.000012 | 28 man, words, father, god, land, things, theseus, arts, children, dearest |
| 29 | 0.010129 | 0.109866 | 0.000013 | 0.000019 | 0.041990 | 0.000076 | 0.006464 | 0.000012 | 29 house, men, father, gods, god, force, hands, city, land, son |
| 30 | 0.030035 | 0.000192 | 0.000013 | 0.026796 | 0.001119 | 0.000062 | 0.000606 | 0.004256 | 30 heart, land, house, thou, men, thee, raft, halls, gods, goddess |
| 31 | 0.000326 | 0.022141 | 0.000013 | 0.000019 | 0.001119 | 0.000014 | 0.005436 | 0.017638 | 31 shield, way, words, spear, slain, sword, troop, life, sea, stood |
| 32 | 0.026887 | 0.026952 | 0.723836 | 0.035122 | 0.511091 | 0.016808 | 0.028716 | 0.012677 | 32 gods, men, son, earth, sea, heart, goddess, daughter, deathless, god |
| 33 | 0.021016 | 0.001780 | 0.000013 | 0.022493 | 0.001119 | 0.000277 | 0.017698 | 0.000012 | 33 thou, land, city, father, thee, man, maidens, heart, mother, gods |
| 34 | 0.000326 | 0.087937 | 0.000013 | 0.003964 | 0.001119 | 0.000014 | 0.016354 | 0.000012 | 34 god, hands, man, house, women, rites, master, thyrsos, things, city |
| 35 | 0.020538 | 0.000192 | 0.000013 | 0.000019 | 0.002539 | 0.000014 | 0.000318 | 0.000012 | 35 bidding, gods, tales, things, sword, brand, lord, passions, arms, deeds |
| 36 | 0.000579 | 0.000192 | 0.000013 | 0.000019 | 0.030804 | 0.000014 | 0.000392 | 0.000012 | 36 gold, gods hands, song, gods, sea, waves sea, set, hands, neck, home |
| 37 | 0.020494 | 0.000192 | 0.000013 | 0.000019 | 0.001119 | 0.000014 | 0.000318 | 0.000012 | 37 house, heart, cloth, grief, blood, dreams, mother, life, sound, libations |
| 38 | 0.000326 | 0.000192 | 0.000013 | 0.000019 | 0.001119 | 0.000014 | 0.013798 | 0.000012 | 38 man, hand, men, goal, gods, time, arms, eyes, flocks, sword |
| 39 | 0.000326 | 0.002914 | 0.000013 | 0.371155 | 0.035352 | 0.003942 | 0.005780 | 0.035866 | 39 son, thou, man, spear, ships, men, battle, heart, horses, thee |
Word embedding algorithms produce representative vectors for words based on word co-occurrence statistics in various contexts. So, words which appear in similar contexts (are surrounded by similar words) should have similar vectors.
# Requires gensim v4.0+
# OHCO_level is the paragraph-level bag (work_id, chapter_id, para_id)
w2v = WordEmbedding(OHCO_level=paragraph_bag)
w2v.fit(corp, seed=0, workers=4) # Seed does not guarantee complete reproducibility without some other parameters (e.g., workers=1), but should be more consistent
The word vectors are high-dimensional (see the fit method above), but we can visualize them to some degree in 2 dimensions using the t-distributed stochastic neighbor embedding (t-SNE) method.

Looking through these, it's possible to notice how some related words can cluster together:
# Plot the t-SNE projection with a fixed random_state
w2v.plot_tsne(random_state=0)
It's also possible to add and subtract word vectors to produce "word analogies" that can sometimes produce interesting results.
# Analogy query over three terms, asking for 10 results.
# NOTE(review): presumably "war is to bronze as peace is to ?" -- confirm argument order in WordEmbedding.word_analogy
w2v.word_analogy('war', 'bronze', 'peace', n=10)
| term | sim | |
|---|---|---|
| 0 | gold | 0.985048 |
| 1 | held | 0.980114 |
| 2 | head | 0.979048 |
| 3 | took | 0.978848 |
| 4 | smote | 0.978692 |
| 5 | ground | 0.978490 |
| 6 | shoulders | 0.977975 |
| 7 | stood | 0.977782 |
| 8 | fell | 0.977330 |
| 9 | cast | 0.977138 |
# Second analogy query, again asking for 10 results
w2v.word_analogy('sea', 'death', 'earth', n=10)
| term | sim | |
|---|---|---|
| 0 | life | 0.976423 |
| 1 | heart | 0.973512 |
| 2 | deed | 0.973239 |
| 3 | mine | 0.970396 |
| 4 | grief | 0.970218 |
| 5 | love | 0.969827 |
| 6 | one | 0.968849 |
| 7 | said | 0.966848 |
| 8 | husband | 0.966678 |
| 9 | answer | 0.966468 |
# Third analogy query; n is omitted here and the output below still lists 10 rows
w2v.word_analogy('sword', 'shield', 'chariot')
| term | sim | |
|---|---|---|
| 0 | horses | 0.996570 |
| 1 | stood | 0.994414 |
| 2 | gold | 0.991702 |
| 3 | took | 0.991255 |
| 4 | air | 0.989980 |
| 5 | held | 0.989638 |
| 6 | round | 0.989574 |
| 7 | drew | 0.989304 |
| 8 | ground | 0.988584 |
| 9 | shoulders | 0.988233 |
The sentiment analysis performed here uses a lexicon-based approach (NRC Emotion Lexicon).
# The NRC lexicon CSV is read from the lexicons folder under data_dir
nrc_lexicon_path = os.path.join(data_dir, 'lexicons', 'salex_nrc.csv')
sa = SentimentAnalysis(nrc_lexicon_path)
sa.fit(corp)
# Show the vocabulary table with the emotion columns and polarity alongside the term statistics
sa.vocab
| n | stop | p_stem | pos_max | df | idf | tfidf_n_sum | tfidf_max_sum | tfidf_bool_sum | anger | anticipation | disgust | fear | joy | negative | positive | sadness | surprise | trust | polarity | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| term_str | ||||||||||||||||||||
| god | 843 | 0 | god | NN | 166 | 0.691050 | 5.509721 | 4.030273 | 5.546902 | 0 | 1 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 1 |
| death | 771 | 0 | death | NN | 154 | 0.799303 | 5.828521 | 4.303188 | 5.199140 | 1 | 1 | 1 | 1 | 0 | 1 | 0 | 1 | 1 | 0 | -1 |
| words | 751 | 0 | word | NNS | 159 | 0.753206 | 5.349911 | 4.143198 | 5.119533 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | -1 |
| spear | 705 | 0 | spear | NN | 92 | 1.542527 | 10.285246 | 4.870976 | 4.521931 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | -1 |
| war | 660 | 0 | war | NN | 91 | 1.558295 | 9.727163 | 4.885061 | 4.996012 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | -1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| raid | 1 | 0 | raid | NN | 1 | 8.066089 | 0.076288 | 0.279828 | 0.481721 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | -1 |
| badness | 1 | 0 | bad | NN | 1 | 8.066089 | 0.076288 | 0.267660 | 0.091553 | 1 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | -1 |
| symmetry | 1 | 0 | symmetri | NN | 1 | 8.066089 | 0.076288 | 0.278352 | 0.462548 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 1 |
| sickly | 1 | 0 | sickli | NN | 1 | 8.066089 | 0.076288 | 0.267975 | 0.082642 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | -1 |
| aggression | 1 | 0 | aggress | NN | 1 | 8.066089 | 0.076288 | 0.267926 | 0.082240 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | -1 |
2000 rows × 20 columns
# Bar chart of sa.works on a wide 25x8 figure
# NOTE(review): sa.works presumably holds one row of sentiment values per work -- confirm in SentimentAnalysis
sa.works.plot.bar(figsize=(25, 8))
<AxesSubplot:>
# Mean-sentiment plots, selected first by author and then by a single work's title
sa.plot_mean_sentiments(author='homer')
sa.plot_mean_sentiments(author='aeschylus')
sa.plot_mean_sentiments(work_title='homeric hymns')
sa.plot_mean_sentiments(work_title="aeneid")
Cells here are colored based on sentiment value in each chapter. Darker colors indicate more intense emotion for a particular column.
# Chapter-level sentiment table for the Iliad, shaded with the yellow-green ("YlGn") colormap
sa.get_chapter_table(work_title="iliad").style.background_gradient(cmap="YlGn")
| anger | anticipation | disgust | fear | joy | sadness | surprise | trust | polarity | ||
|---|---|---|---|---|---|---|---|---|---|---|
| work_id | chapter_id | |||||||||
| 9 | 0 | 1.730861 | 1.214255 | 0.972059 | 1.817043 | 1.318384 | 1.101758 | 0.438503 | 1.800535 | -0.614109 |
| 1 | 1.569692 | 1.009844 | 0.901147 | 1.873647 | 1.077041 | 1.634125 | 0.565258 | 1.986575 | -0.609755 | |
| 2 | 1.580227 | 1.289846 | 0.814903 | 1.495365 | 1.392562 | 0.990126 | 0.461849 | 1.489961 | -0.287252 | |
| 3 | 2.181589 | 1.192394 | 0.764923 | 1.963697 | 1.214398 | 0.923044 | 0.488568 | 1.694840 | -0.998156 | |
| 4 | 3.262166 | 1.549703 | 1.032995 | 3.443559 | 1.272826 | 1.872256 | 0.624747 | 1.753074 | -2.597854 | |
| 5 | 1.518744 | 1.276081 | 0.829029 | 1.754722 | 1.390495 | 1.209623 | 0.467575 | 1.465352 | -0.494542 | |
| 6 | 2.123177 | 1.280881 | 0.460440 | 2.238135 | 1.101224 | 1.024433 | 0.431435 | 1.667092 | -1.048688 | |
| 7 | 1.844810 | 1.103386 | 0.707958 | 1.946045 | 1.180164 | 1.467844 | 0.724006 | 1.595467 | -1.019361 | |
| 8 | 1.473605 | 1.335805 | 0.995624 | 1.404845 | 1.641407 | 1.443460 | 0.643026 | 1.792408 | 0.043342 | |
| 9 | 1.312702 | 1.366772 | 0.599242 | 1.825061 | 1.162234 | 1.134426 | 0.394226 | 1.976554 | 0.082191 | |
| 10 | 2.748826 | 1.487001 | 1.016689 | 2.610600 | 1.138634 | 1.887580 | 0.698926 | 1.615874 | -1.832680 | |
| 11 | 2.160222 | 0.813262 | 0.668921 | 2.208396 | 0.705244 | 1.118996 | 0.405116 | 1.301246 | -1.521955 | |
| 12 | 2.833502 | 1.647424 | 0.997055 | 3.049750 | 1.244460 | 1.836388 | 0.681599 | 1.801621 | -1.829589 | |
| 13 | 1.845478 | 1.017470 | 0.496210 | 1.725659 | 1.223308 | 1.368212 | 0.584145 | 1.446289 | -0.423964 | |
| 14 | 2.487287 | 1.050446 | 0.739470 | 2.572164 | 0.929834 | 1.559014 | 0.530354 | 1.647100 | -1.973406 | |
| 15 | 3.144302 | 1.535136 | 1.188081 | 3.273769 | 1.128839 | 2.296683 | 0.461497 | 1.583914 | -2.597980 | |
| 16 | 2.751813 | 1.360859 | 1.373518 | 2.889983 | 1.294970 | 2.315920 | 0.630066 | 1.962683 | -2.585254 | |
| 17 | 1.522092 | 1.142675 | 0.802998 | 1.861608 | 1.271713 | 1.450946 | 0.799835 | 1.349188 | -0.831561 | |
| 18 | 1.487216 | 0.979899 | 0.651299 | 1.550720 | 1.145367 | 1.627072 | 0.464176 | 1.503287 | -0.637740 | |
| 19 | 2.362125 | 1.280978 | 0.688755 | 2.246929 | 1.010298 | 1.151469 | 0.363628 | 1.245117 | -1.809305 | |
| 20 | 2.050637 | 1.400315 | 1.145400 | 2.378893 | 1.142389 | 1.372184 | 0.508895 | 1.171902 | -1.699190 | |
| 21 | 1.580056 | 1.369503 | 0.763420 | 1.987672 | 1.169273 | 1.519195 | 0.451642 | 1.288005 | -0.991747 | |
| 22 | 1.785274 | 1.165313 | 0.943263 | 1.731714 | 1.353458 | 1.600835 | 0.746350 | 1.975793 | -0.242871 | |
| 23 | 1.976006 | 1.245570 | 0.954946 | 2.422350 | 1.373259 | 1.900938 | 0.653599 | 1.771831 | -0.640793 |
# # Make output directories if they don't already exist
# save_dirs = ['corpus', 'pca', 'topic-model', 'word-embeddings', 'sentiment-analysis']
# for d in save_dirs:
# d_path = os.path.join(output_dir, d)
# try:
# os.mkdir(d_path)
# except FileExistsError:
# continue
# # Save corpus tables
# corp.save_tables(os.path.join(output_dir, 'corpus'))
# # Save PCA tables
# pca_dir = os.path.join(output_dir, 'pca')
# pca.dcm.to_csv(os.path.join(pca_dir, 'DCM.csv'))
# pca.loadings.to_csv(os.path.join(pca_dir, 'LOADINGS.csv'))
# # Save Topic Model tables
# lda_dir = os.path.join(output_dir, 'topic-model')
# lda.theta.to_csv(os.path.join(lda_dir, 'THETA.csv'))
# lda.phi.to_csv(os.path.join(lda_dir, 'PHI.csv'))
# # Save Word Embedding tables
# w2v_dir = os.path.join(output_dir, 'word-embeddings')
# w2v.vectors.to_csv(os.path.join(w2v_dir, 'word2vec-EMBEDDINGS.csv'))
# # Save Sentiment Analysis tables
# sa_dir = os.path.join(output_dir, 'sentiment-analysis')
# sa.vocab.to_csv(os.path.join(sa_dir, 'VOCAB-sentiment.csv'))
# sa.bow.to_csv(os.path.join(sa_dir, 'DOC-sentiment.csv'))